import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import psycopg2
import random
import seaborn as sns
from psycopg2.extras import DictCursor
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
plt.rcParams.update({'font.size': 14})
pd.set_option('display.float_format', lambda x: '%.4f' % x)
postgres_connections_file = '/Users/chrispatrick/secrets/postgres_credentials.json'
with open(postgres_connections_file, 'r') as f:
postgres_connection_params = json.load(f)
connection_params = postgres_connection_params['PRODUCTION']
class Postgres():
def __init__(self):
self.conn = None
def open(self):
self.conn = psycopg2.connect(
host=connection_params['host'],
user=connection_params['user'],
password=connection_params['password'],
dbname=connection_params['dbname'],
sslmode='require'
)
def close(self):
self.conn.close()
def fetch(self, qry):
with self.conn.cursor(cursor_factory=DictCursor) as cursor:
cursor.execute(qry)
result = cursor.fetchall()
return [dict(row) for row in result]
def execute(self, qry):
with self.conn.cursor(cursor_factory=DictCursor) as cursor:
cursor.execute(qry)
def run_query(qry):
pg = Postgres()
pg.open()
result = pg.fetch(qry)
pg.close()
return result
1. How many orders were completed in 2018? (Note: We operate in US/Eastern time zone)
a1_qry = '''
SELECT
COUNT(DISTINCT order_id)
FROM
orders
WHERE
order_timestamp AT TIME ZONE 'utc' AT TIME ZONE 'America/New_York' >= '2018-01-01'
AND order_timestamp AT TIME ZONE 'utc' AT TIME ZONE 'America/New_York' < '2019-01-01'
'''
run_query(a1_qry)
[{'count': 9228}]
2. How many orders were completed in 2018 containing at least 10 units?
NOTE: I used SELECT DISTINCT in the CTE as I noticed there were duplicated records in the line_items table. I removed the duplicates when ingesting the data into Postgres, but the solution below will work if that step was not taken.
a2_qry = '''
WITH unique_line_items AS
(
SELECT DISTINCT
line_items.line_item_id,
orders.order_id,
quantity
FROM
line_items
INNER JOIN
orders
ON
line_items.order_id = orders.order_id
WHERE
order_timestamp AT TIME ZONE 'utc' AT TIME ZONE 'America/New_York' >= '2018-01-01'
AND order_timestamp AT TIME ZONE 'utc' AT TIME ZONE 'America/New_York' < '2019-01-01'
)
SELECT
COUNT(DISTINCT order_id)
FROM
(
SELECT
order_id,
SUM(quantity) AS quantity
FROM
unique_line_items
GROUP BY
order_id
HAVING
SUM(quantity) >= 10
) orders_with_more_than_10
'''
run_query(a2_qry)
[{'count': 5151}]
3. How many customers have ever purchased a medium sized sweater with a discount?
a3_qry = '''
SELECT
COUNT(DISTINCT orders.customer_uid)
FROM
line_items
INNER JOIN
orders
ON
line_items.order_id = orders.order_id
WHERE
product_category = 'Sweater'
AND size = 'M'
'''
run_query(a3_qry)
[{'count': 1131}]
4. How profitable was our most profitable month?
NOTE: After applying the discount to the customer costs, I rounded down to the nearest cent. If JiffyShirts uses "real" rounding, the following can be used as the last step in the qry:
SELECT
order_month,
CAST(
SUM(
ROUND((non_discounted_rate * (price + shipping_revenue))::NUMERIC, 2) -
(supplier_cost + shipping_cost)
)
AS DECIMAL(16,2)
) AS profit
FROM
order_prices
The profit for the most profitable month (October 2020) would be $63,612.08 with that change applied, a difference of only about 5 dollars.
a4_qry = '''
WITH line_item_data AS
(
SELECT DISTINCT ON(line_item_id)
order_id,
line_item_id,
quantity * COALESCE(selling_price, 0) AS line_item_price,
quantity * COALESCE(supplier_cost, 0) AS line_item_supplier_cost
FROM
line_items
),
order_data AS
(
SELECT
order_id,
order_timestamp AT TIME ZONE 'utc' AT TIME ZONE 'America/New_York' AS order_timestamp_eastern,
1 - discount AS non_discounted_rate,
shipping_revenue,
shipping_cost
FROM
orders
),
order_prices AS
(
SELECT
CONCAT(
TO_CHAR(DATE_TRUNC('month', order_timestamp_eastern), 'Mon'),
' ',
EXTRACT(year from order_timestamp_eastern)::TEXT
) AS order_month,
line_item_data.order_id,
non_discounted_rate,
shipping_revenue,
shipping_cost,
SUM(line_item_price)::DECIMAL AS price,
SUM(line_item_supplier_cost)::DECIMAL AS supplier_cost
FROM
line_item_data
INNER JOIN
order_data
ON
line_item_data.order_id = order_data.order_id
GROUP BY
1,2,3,4,5
)
SELECT
order_month,
CAST(
SUM(
FLOOR( non_discounted_rate * (price + shipping_revenue) * 100) / 100 -
(supplier_cost + shipping_cost)
)
AS DECIMAL(16, 2)
)
AS profit
FROM
order_prices
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1
'''
run_query(a4_qry)
[{'order_month': 'Oct 2020', 'profit': Decimal('63607.63')}]
5. What is the return rate for business vs. non-business customers?
a5_qry = '''
SELECT
is_business,
ROUND(
(
COUNT(DISTINCT CASE WHEN returned = 'true' THEN order_id END)::FLOAT /
COUNT(DISTINCT order_id)
)::NUMERIC
, 4) AS return_rate
FROM
customers
INNER JOIN
orders
ON
customers.customer_uid = orders.customer_uid
GROUP BY 1
'''
run_query(a5_qry)
[{'is_business': False, 'return_rate': Decimal('0.0492')},
{'is_business': True, 'return_rate': Decimal('0.0674')}]
Customer Segmentation
How should we segment our customers to align different kinds of marketing messaging and offers?
Key customer metrics:
Key findings:
customer_qry = '''
WITH line_item_data AS
(
SELECT DISTINCT ON(line_item_id)
order_id,
line_item_id,
quantity * COALESCE(selling_price, 0) AS line_item_price,
quantity * COALESCE(supplier_cost, 0) AS line_item_supplier_cost
FROM
line_items
),
order_data AS
(
SELECT
customer_uid,
order_id,
order_timestamp AT TIME ZONE 'utc' AT TIME ZONE 'America/New_York' AS order_timestamp_eastern,
1 - discount AS non_discounted_rate,
shipping_revenue,
shipping_cost
FROM
orders
),
order_prices AS
(
SELECT
customer_uid,
line_item_data.order_id,
non_discounted_rate,
shipping_revenue,
shipping_cost,
SUM(line_item_price)::DECIMAL AS price,
SUM(line_item_supplier_cost)::DECIMAL AS supplier_cost
FROM
line_item_data
INNER JOIN
order_data
ON
line_item_data.order_id = order_data.order_id
GROUP BY
1,2,3,4,5
),
profit_by_customer AS (
SELECT
customer_uid,
CAST(
SUM(
FLOOR( non_discounted_rate * (price + shipping_revenue) * 100) / 100 -
(supplier_cost + shipping_cost)
)
AS DECIMAL(16, 2)
) AS profit
FROM
order_prices
GROUP BY 1
)
SELECT
customers.customer_uid,
is_business,
has_account,
acquisition_channel,
COUNT(DISTINCT order_id) AS order_count,
CASE
WHEN COUNT(DISTINCT order_id) > 1 THEN true
ELSE false
END AS multiple_orders,
ROUND(
(
COUNT(DISTINCT CASE WHEN returned = 'true' THEN order_id END)::FLOAT /
COUNT(DISTINCT order_id)
)::NUMERIC
, 4) AS return_rate,
MAX(order_timestamp)::date - MIN(order_timestamp)::date
AS days_between_first_and_last_order,
CASE
WHEN COUNT(DISTINCT order_id) = 1 THEN 0
ELSE
CAST(
(MAX(order_timestamp)::date - MIN(order_timestamp)::date)::FLOAT /
NULLIF((COUNT(DISTINCT order_id) - 1), 0)
AS DECIMAL(16,2)
)
END AS average_days_between_orders,
ROUND(
(
COUNT(DISTINCT CASE WHEN discount > 0 THEN order_id END)::FLOAT /
COUNT(DISTINCT order_id)
)::NUMERIC
, 2) AS pct_discounted,
ROUND(AVG(discount)::NUMERIC, 4) AS avg_discount,
SUM(profit) AS lifetime_profit,
CAST(
SUM(profit) / COUNT(DISTINCT order_id)
AS DECIMAL(16,2)
) AS avg_order_profit
FROM
customers
INNER JOIN
orders
ON
customers.customer_uid = orders.customer_uid
INNER JOIN
profit_by_customer
ON
customers.customer_uid = profit_by_customer.customer_uid
GROUP BY 1
ORDER BY CAST(
SUM(profit) / COUNT(DISTINCT order_id)
AS DECIMAL(16,2)
) DESC
'''
customers = run_query(customer_qry)
orig_df = pd.DataFrame.from_records(customers)
df = orig_df.copy()
df.head()
| customer_uid | is_business | has_account | acquisition_channel | order_count | multiple_orders | return_rate | days_between_first_and_last_order | average_days_between_orders | pct_discounted | avg_discount | lifetime_profit | avg_order_profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3069349651afb03593af7d090535f5dc | False | False | paid search | 335 | True | 0.0030 | 1785 | 5.34 | 0.63 | 0.0427 | 7203163.30 | 21501.98 |
| 1 | 8f027f83d638ebf16d9b06493d78c0eb | True | True | paid search | 258 | True | 0.0969 | 1191 | 4.63 | 0.59 | 0.0481 | 4705313.70 | 18237.65 |
| 2 | 74cb87bd34763f1a437fab63ad017646 | False | True | paid search | 123 | True | 0.0000 | 1220 | 10.00 | 0.77 | 0.0809 | 1778095.38 | 14456.06 |
| 3 | 64f9abc25d599e49cfddeb3de7774ca0 | True | True | paid search | 285 | True | 0.1193 | 1727 | 6.08 | 0.47 | 0.0329 | 3985123.65 | 13982.89 |
| 4 | 85164ec0e55513b57af6c9395230aa42 | True | True | paid search | 247 | True | 0.0486 | 1046 | 4.25 | 0.55 | 0.0425 | 3063708.96 | 12403.68 |
fig = px.scatter_matrix(
df.drop('customer_uid', axis=1),
width=1200,
height=1600
)
fig.show()
df.dtypes
customer_uid object is_business bool has_account bool acquisition_channel object order_count int64 multiple_orders bool return_rate object days_between_first_and_last_order int64 average_days_between_orders object pct_discounted object avg_discount object lifetime_profit object avg_order_profit object dtype: object
# Update data types
# Change decimals to floats
for col in [
'return_rate',
'average_days_between_orders',
'pct_discounted',
'avg_discount',
'lifetime_profit',
'avg_order_profit'
]:
df[col] = df[col].astype(float)
# Create dummy values for acquisition channel, which will be one-hot encoded
channels = df.acquisition_channel
df = df.join(pd.get_dummies(channels))
df.drop('acquisition_channel', axis=1, inplace=True)
# Change booleans to one hot encoding
for col in df.columns:
if df[col].dtype == 'bool':
df[col] = df[col].astype(np.int)
df.head()
| customer_uid | is_business | has_account | order_count | multiple_orders | return_rate | days_between_first_and_last_order | average_days_between_orders | pct_discounted | avg_discount | lifetime_profit | avg_order_profit | coupon aggregator | direct | organic search | paid search | paid social | referral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3069349651afb03593af7d090535f5dc | 0 | 0 | 335 | 1 | 0.0030 | 1785 | 5.3400 | 0.6300 | 0.0427 | 7203163.3000 | 21501.9800 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1 | 8f027f83d638ebf16d9b06493d78c0eb | 1 | 1 | 258 | 1 | 0.0969 | 1191 | 4.6300 | 0.5900 | 0.0481 | 4705313.7000 | 18237.6500 | 0 | 0 | 0 | 1 | 0 | 0 |
| 2 | 74cb87bd34763f1a437fab63ad017646 | 0 | 1 | 123 | 1 | 0.0000 | 1220 | 10.0000 | 0.7700 | 0.0809 | 1778095.3800 | 14456.0600 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 64f9abc25d599e49cfddeb3de7774ca0 | 1 | 1 | 285 | 1 | 0.1193 | 1727 | 6.0800 | 0.4700 | 0.0329 | 3985123.6500 | 13982.8900 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4 | 85164ec0e55513b57af6c9395230aa42 | 1 | 1 | 247 | 1 | 0.0486 | 1046 | 4.2500 | 0.5500 | 0.0425 | 3063708.9600 | 12403.6800 | 0 | 0 | 0 | 1 | 0 | 0 |
# Check dtypes
df.dtypes
customer_uid object is_business int64 has_account int64 order_count int64 multiple_orders int64 return_rate float64 days_between_first_and_last_order int64 average_days_between_orders float64 pct_discounted float64 avg_discount float64 lifetime_profit float64 avg_order_profit float64 coupon aggregator uint8 direct uint8 organic search uint8 paid search uint8 paid social uint8 referral uint8 dtype: object
# Normalize columns with numeric data
convert_cols = [
'order_count',
'return_rate',
'days_between_first_and_last_order',
'average_days_between_orders',
'pct_discounted',
'avg_discount',
'lifetime_profit',
'avg_order_profit'
]
df_num = df[convert_cols].copy()
df_normalized = (df_num - df_num.mean()) / df_num.std()
df_normalized
features = df.copy()
features.drop(convert_cols, axis=1, inplace=True)
features = features.join(df_normalized)
features = features.loc[:, features.columns != 'customer_uid'].copy()
features.head()
| is_business | has_account | multiple_orders | coupon aggregator | direct | organic search | paid search | paid social | referral | order_count | return_rate | days_between_first_and_last_order | average_days_between_orders | pct_discounted | avg_discount | lifetime_profit | avg_order_profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 20.1766 | -0.2447 | 4.5624 | -0.3539 | 0.9326 | 0.5962 | 53.1537 | 30.5921 |
| 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 15.4617 | 0.3004 | 2.8655 | -0.3591 | 0.8275 | 0.7454 | 34.6942 | 25.9081 |
| 2 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 7.1954 | -0.2621 | 2.9483 | -0.3199 | 1.3005 | 1.6521 | 13.0617 | 20.4819 |
| 3 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 17.1150 | 0.4304 | 4.3967 | -0.3485 | 0.5122 | 0.3253 | 29.3719 | 19.8029 |
| 4 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 14.7882 | 0.0200 | 2.4513 | -0.3619 | 0.7224 | 0.5906 | 22.5625 | 17.5369 |
kmeans_kwargs = {
"init": "random",
"n_init": 10,
"max_iter": 300,
"random_state": 42
}
sse = []
silhouette_coefficients = []
for k in range(1, 30):
kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
kmeans.fit(features)
sse.append(kmeans.inertia_)
if k > 1:
score = silhouette_score(features, kmeans.labels_)
silhouette_coefficients.append(score)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.style.use("fivethirtyeight")
plt.plot(range(1, 30), sse)
plt.xticks(range(1, 30))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()
kl = KneeLocator(
range(1, 30), sse, curve="convex", direction="decreasing"
)
kl.elbow
8
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.style.use("fivethirtyeight")
plt.plot(range(2, 30), silhouette_coefficients)
plt.xticks(range(2, 30))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
# Choose 6 clusters based on these 2 plots
random.seed(10)
kmeans = KMeans(
init="random",
n_clusters=6,
n_init=10,
max_iter=300,
random_state=42
)
kmeans.fit(features)
df['label'] = [str(label) for label in kmeans.labels_]
features['label'] = [str(label) for label in kmeans.labels_]
df.label.value_counts()
1 4836 5 1711 3 1092 0 439 2 331 4 48 Name: label, dtype: int64
df.drop('customer_uid', axis=1).groupby('label').mean().reset_index()
| label | is_business | has_account | order_count | multiple_orders | return_rate | days_between_first_and_last_order | average_days_between_orders | pct_discounted | avg_discount | lifetime_profit | avg_order_profit | coupon aggregator | direct | organic search | paid search | paid social | referral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.2642 | 0.4738 | 2.9203 | 1.0000 | 0.0248 | 842.1708 | 513.9810 | 0.2748 | 0.0216 | 288.7133 | 88.3423 | 0.0000 | 0.5604 | 0.0638 | 0.2597 | 0.0296 | 0.0866 |
| 1 | 1 | 0.2217 | 0.3724 | 2.0434 | 0.3290 | 0.0064 | 42.8757 | 18.1146 | 0.0405 | 0.0022 | 185.7061 | 38.1967 | 0.2713 | 0.1946 | 0.2018 | 0.1917 | 0.1098 | 0.0308 |
| 2 | 2 | 0.2447 | 0.5498 | 2.6918 | 0.5015 | 0.8119 | 61.4320 | 27.8973 | 0.2912 | 0.0228 | 1188.6669 | 98.7273 | 0.2205 | 0.2810 | 0.1239 | 0.2175 | 0.0785 | 0.0785 |
| 3 | 3 | 0.6557 | 0.8498 | 21.2363 | 1.0000 | 0.0454 | 767.1328 | 79.1606 | 0.3147 | 0.0237 | 26158.4982 | 667.5569 | 0.0000 | 0.2271 | 0.0073 | 0.7299 | 0.0000 | 0.0357 |
| 4 | 4 | 0.8333 | 0.9583 | 161.8542 | 1.0000 | 0.0805 | 1218.2083 | 8.4060 | 0.4598 | 0.0356 | 1232929.2469 | 6931.4479 | 0.0000 | 0.0000 | 0.0000 | 1.0000 | 0.0000 | 0.0000 |
| 5 | 5 | 0.4120 | 0.4921 | 1.9982 | 0.3647 | 0.0106 | 55.8948 | 26.7542 | 0.9047 | 0.0722 | 516.0796 | 129.4444 | 0.1391 | 0.2864 | 0.0538 | 0.4091 | 0.0590 | 0.0526 |
df.drop('customer_uid', axis=1).groupby('label').median().reset_index()
| label | is_business | has_account | order_count | multiple_orders | return_rate | days_between_first_and_last_order | average_days_between_orders | pct_discounted | avg_discount | lifetime_profit | avg_order_profit | coupon aggregator | direct | organic search | paid search | paid social | referral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.0000 | 0.0000 | 3.0000 | 1.0000 | 0.0000 | 790.0000 | 416.7500 | 0.2000 | 0.0080 | 163.9000 | 60.1400 | 0.0000 | 1.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 |
| 1 | 1 | 0.0000 | 0.0000 | 1.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 20.7100 | 19.4450 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 |
| 2 | 2 | 0.0000 | 1.0000 | 2.0000 | 1.0000 | 1.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 56.7600 | 37.6900 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 |
| 3 | 3 | 1.0000 | 1.0000 | 14.0000 | 1.0000 | 0.0000 | 696.0000 | 54.3950 | 0.2900 | 0.0198 | 5242.1450 | 383.4550 | 0.0000 | 0.0000 | 0.0000 | 1.0000 | 0.0000 | 0.0000 |
| 4 | 4 | 1.0000 | 1.0000 | 147.0000 | 1.0000 | 0.0154 | 1183.0000 | 7.4000 | 0.4700 | 0.0314 | 733445.4600 | 5650.5950 | 0.0000 | 0.0000 | 0.0000 | 1.0000 | 0.0000 | 0.0000 |
| 5 | 5 | 0.0000 | 0.0000 | 1.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 1.0000 | 0.0550 | 75.4100 | 66.1500 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 |
palette = {
'5': '#359B73',
'4': '#2271B2',
'3': '#3DE7B9',
'2': '#F748A5',
'1': '#000000',
'0': '#D55E00'
}
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
sns.scatterplot(x='avg_order_profit', y='order_count', data=df, palette=palette, hue='label')
plt.show()
for label in palette.keys():
print(f'Count of customers for group {label}: {df.label.value_counts()[label]}')
sns.scatterplot(
x='avg_order_profit',
y='order_count',
data=df.loc[df.label==label],
palette=palette,
hue='label'
)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
Count of customers for group 5: 1711
Count of customers for group 4: 48
Count of customers for group 3: 1092
Count of customers for group 2: 331
Count of customers for group 1: 4836
Count of customers for group 0: 439
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
sns.scatterplot(
x='avg_order_profit',
y='order_count',
data=df.loc[df.label.isin(['0', '1', '2', '3', '5'])],
palette=palette,
hue='label'
)
plt.show()
scale_columns = [
'order_count',
'days_between_first_and_last_order',
'average_days_between_orders',
'lifetime_profit',
'avg_order_profit'
]
subset = df[scale_columns].copy()
subset = (subset - subset.min()) / (subset.max() - subset.min())
polar = subset.join(df.loc[:, [col for col in df if col not in scale_columns]])
polar = polar.drop('customer_uid', axis=1).groupby('label').mean().reset_index()
fig = px.line_polar(
pd.melt(polar, id_vars=['label']),
r='value',
theta='variable',
color='label',
color_discrete_map=palette,
line_close=True,
height=800,
width=1400
)
fig.show()